# ASF case study (K-functions code, Section 6.5)
# R code for generating the output presented in Section 6.5 of the Final Report (K functions)
# Written by Stephen Catterall
# Last edited 16th June 2016
# To run the code successfully, you need to specify a working directory below at line 9
# The working directory should contain all relevant data files, as well as this file and the baseline.R file
# Note: many of the procedures used in this code rely on random number generators, so the outputs produced should be *very similar* to those contained in the report, but may not be *identical*

#workingd<-"" # specify working directory here
#setwd(workingd)
require(splancs) # load R package for computing K-functions
source("baseline.R") # read code for the baseline methods

# Each of the three code sections below (6.5.2, 6.5.4, 6.5.5) is self-contained. They can be run in sequence or you can just choose one section and run it independently.
# WARNING: running the baseline methods on the large simulated datasets (500 cases or full set) can be very time consuming! Some of the simulated data sets took approximately 6 hours to process on my desktop computer.

#######################################################################################################################################
# 6.5.2 ABSOLUTE ASSESSMENT
#######################################################################################################################################
# Note: this section uses 120 simulated data sets, with filenames as follows:
# results_i.RData where i runs from 1 up to 30; these 30 datasets contain the original (full sampling) simulated outbreaks
# i=1-5 are simulations of the epidemic scenario with a wide kernel 
# i=6-10 are simulations of the low-level endemic scenario with a wide kernel
# i=11-15 are simulations of the high-level endemic scenario with a wide kernel
# i=16-20 are simulations of the epidemic scenario with a narrow kernel
# i=21-25 are simulations of the low-level endemic scenario with a narrow kernel
# i=26-30 are simulations of the high-level endemic scenario with a narrow kernel
#
# results_500_i.RData (i=1..30) are the corresponding data sets sampled to 500 cases
# results_100_i.RData (i=1..30) are the corresponding data sets sampled to 100 cases
# results_50_i.RData (i=1..30) are the corresponding data sets sampled to 50 cases

# Absolute assessment: for every sampling level and every simulated outbreak,
# compute the space-time K-function, run the Monte Carlo test for space-time
# interaction and write an image of the relative excess D0 to a PNG file.
#
# mc_test_results stores, for each data set, the rank of the observed test
# statistic within the vector made of itself and the 99 simulated statistics
# (4 sampling levels i.e. full, 500, 100, 50; 30 simulations as detailed above)
mc_test_results <- array(0, dim = c(4, 30))

ssize <- c(1000, 500, 100, 50) # 1000 is the code used for the full (unsampled) data sets

# Loop-invariant settings, defined once instead of once per data set
s_grid <- seq(0.01, 0.2, 0.01) # sequence of distances at which the K-function is computed
t_grid <- seq(0.05, 1, 0.05) # sequence of times at which the K-function is computed
d0_greys <- grey(c(0.95, 0.8, 0.7, 0.6, 0.4, 0.3, 0.2, 0.1))
d0_breaks <- c(-1.1, -0.1, 0, 0.05, 0.1, 0.2, 0.5, 1.0, 101)

for (j in seq_along(ssize)) {
  samplinglevel <- ssize[j]
  for (index in 1:30) {
    cat("\nIndex=", index)

    # Work out which file to load and which object it has to provide
    if (samplinglevel < 1000) {
      fname <- paste0("results_", samplinglevel, "_", index, ".RData")
      wanted <- "reduced.infections"
    } else {
      fname <- paste0("results_", index, ".RData")
      wanted <- "infections"
    }

    # Load simulated case data. load() returns the names of the objects it
    # created; checking them stops the loop from silently re-analysing the
    # case data left over from the previous iteration when a file is incomplete.
    loaded <- load(fname)
    if (!(wanted %in% loaded)) {
      stop("File ", fname, " does not contain an object named '", wanted, "'", call. = FALSE)
    }
    infections <- get(wanted)

    # Prepare data for analysis by K-functions
    cases <- data.frame(x = infections$x, y = infections$y, t = infections$time)
    case_pts <- as.points(cases$x, cases$y)
    time_limits <- c(min(cases$t), max(cases$t))
    box <- bbox(case_pts)
    box[, 1] <- box[, 1] - 0.001 # avoid problems with cases on the boundary of the region of interest
    box[, 2] <- box[, 2] + 0.001 # avoid problems with cases on the boundary of the region of interest
    bounding.box <- bboxx(box) # define a rectangle surrounding the cases in space

    # Call the stkhat function in the splancs package, which computes K-functions
    data.stkhat <- stkhat(case_pts, cases$t, bounding.box, time_limits, s_grid, t_grid)

    # Call the stmctest function in the splancs package, which performs the
    # associated Monte Carlo test for space-time interaction.
    # TRUE/FALSE are spelled out because T and F are ordinary variables that
    # sourced code could have reassigned.
    mc <- stmctest(case_pts, cases$t, bounding.box, time_limits, s_grid, t_grid,
                   nsim = 99, returnSims = FALSE, quiet = TRUE)

    # Store output from Monte Carlo test: rank of the observed statistic (mc$t0)
    # among itself and the simulated statistics (mc$t)
    mc_test_results[j, index] <- rank(c(mc$t0, mc$t))[1]

    # Relative difference between the space-time K-function (kst) and the
    # product of the ks and kt components returned by stkhat; computed before
    # the graphics device is opened
    ks_kt <- outer(data.stkhat$ks, data.stkhat$kt)
    data.stkhat.D <- data.stkhat$kst - ks_kt
    D0 <- data.stkhat.D / ks_kt

    # Generate figures
    if (samplinglevel < 1000) {
      stout <- paste0("Kst_", samplinglevel, "_", index, ".png")
    } else {
      stout <- paste0("Kst_", index, ".png")
    }
    png(stout)
    # finally= guarantees the device is closed even if image() fails
    tryCatch(
      image(t(D0), col = d0_greys, breaks = d0_breaks,
            x = data.stkhat$t, y = data.stkhat$s, xlab = "Time", ylab = "Distance"),
      finally = dev.off()
    )
  }
}
# Figures 38-45 in the report can be generated from the output files produced e.g. Figure 38 is based on Kst_i.png for i=1..15
# Table 23 can be generated from the ranks contained in mc_test_results (each entry is the rank, out of 100, of the observed test statistic among itself and the 99 simulated test statistics) e.g. the top left percentage in Table 23 is obtained by taking the subarray mc_test_results[1,1:5] and computing the percentage of values in this subarray that exceed 95.


#######################################################################################################################################
# 6.5.4 DIRECT DATA-BASED COMPARISON
#######################################################################################################################################
# Note: this section uses the ASF data in the data file ASF.RData
#
# 1. Compute K-functions
#
# See report for a precise description of this data set
# Read the ASF case data; the code below expects this to provide table2 with columns x, y and t
load("ASF.RData")

# Approximate conversion from latitude/longitude to km
# NOTE(review): 111 km per degree is the usual factor for latitude - confirm
# that table2$x holds latitude and table2$y longitude, otherwise the two factors are swapped
table2$x <- 111 * table2$x
table2$y <- 79 * table2$y

# Prepare data for analysis by K-functions: case locations, their bounding
# rectangle, the observed time span and the grid (10, 20, ..., 200) that is
# used both for distances and for times
asf_pts <- as.points(table2$x, table2$y)
box <- bbox(asf_pts)
bounding.box <- bboxx(box)
asf_time_limits <- c(min(table2$t), max(table2$t))
asf_grid <- seq(10, 200, 10)

# Call the stkhat function in the splancs package, which computes K-functions
data.stkhat <- stkhat(
  asf_pts,
  table2$t,
  bounding.box,
  asf_time_limits,
  asf_grid,
  asf_grid
)

# D is the difference between kst and the product of the ks and kt components;
# D0 expresses that difference relative to the product
asf_ks_kt <- outer(data.stkhat$ks, data.stkhat$kt)
D <- data.stkhat$kst - asf_ks_kt
D0 <- D / asf_ks_kt

# Generate output (this is Figure 46, left)
asf_greys <- grey(c(0.95, 0.8, 0.7, 0.6, 0.4, 0.3, 0.2, 0.1))
asf_breaks <- c(0.5, 1, 1.25, 1.5, 1.75, 2, 3, 5, 15)
png("Kst_asf.png")
image(
  t(D0),
  col = asf_greys,
  breaks = asf_breaks,
  x = data.stkhat$t,
  y = data.stkhat$s,
  xlab = "Time [days]",
  ylab = "Radius [km]"
)
dev.off()
#
# 2. Compute Lange et al. 2014 statistics
#
# Read the ASF data afresh, so that table2 is as stored in the file (the
# coordinates are passed on with format="ll" - presumably latitude/longitude;
# confirm against baseline.R)
load("ASF.RData")
table <- data.frame(
  x = table2$x,
  y = table2$y,
  t = table2$t
)

# Call baseline model code on a grid 5, 10, ..., 200 used for both the
# distances and the times, with 499 permutations
lange_grid <- seq(5, 200, 5)
test <- endemicity.test(
  table,
  dist = lange_grid,
  time = lange_grid,
  num.permutations = 499,
  format = "ll"
)

# Generate graphical output

# Figure 46, centre: image of test$orig in ten grey levels
pend_greys <- grey(1 - 0.1 * 0:9)
pend_breaks <- c(-1, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.01)
png("Lange_pend_asf.png")
image(
  t(test$orig),
  col = pend_greys,
  breaks = pend_breaks,
  x = test$time,
  y = test$dist,
  xlab = "Time [days]",
  ylab = "Radius [km]"
)
dev.off()

# Figure 46, right: image of test$quantile in four grey levels
quantile_greys <- grey(c(0.9, 0.7, 0.4, 0.1))
quantile_breaks <- c(-1, 1, 5, 10, 101)
png("Lange_test_asf.png")
image(
  t(test$quantile),
  col = quantile_greys,
  breaks = quantile_breaks,
  x = test$time,
  y = test$dist,
  xlab = "Time [days]",
  ylab = "Radius [km]"
)
dev.off()

#######################################################################################################################################
# 6.5.5 SIMULATION-BASED COMPARISON
#######################################################################################################################################
# Note: this section focusses on the 30 simulated datasets from 6.5.2 with sample size 500 cases - these are the simulations for which results are presented graphically in the Final Report. 
# However, the hypothesis testing results are presented for all sampling levels i.e. for all simulations described in Section 6.5.2
# The figures containing K-function framework results (Figure 47, Figure 50) are copies of those produced in Section 6.5.2
# The table containing K-function framework results (Table 25) is a copy of the one produced in Section 6.5.2
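# So, in this section we just apply the baseline model to the simulated datasets; the loop below runs over all four sampling levels, but only the graphical outputs for sample size 500 are presented in the report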
# WARNING: running the baseline methods on the large simulated datasets (500 cases or full set) can be very time consuming! Some of the simulated data sets took approximately 6 hours to process on my desktop computer.

# Simulation-based comparison: apply the baseline model (endemicity.test from
# baseline.R) to every simulated data set, write its two graphical outputs to
# PNG files and record the omnibus test result.
#
# Lange_omnibus_test_results stores the omnibus test results
# (4 sampling levels i.e. full, 500, 100, 50; 30 simulations as detailed above in 6.5.2)
Lange_omnibus_test_results <- array(0, dim = c(4, 30))
ssize <- c(1000, 500, 100, 50) # 1000 is the code used for the full (unsampled) data sets

# Loop-invariant plot settings, defined once instead of once per data set
lange_quantile_greys <- grey(c(0.9, 0.7, 0.4, 0.1))
lange_quantile_breaks <- c(-1, 1, 5, 10, 101)
lange_pend_greys <- grey(1 - 0.1 * 0:9)
lange_pend_breaks <- c(-1, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.01)

for (j in seq_along(ssize)) {
  samplinglevel <- ssize[j]
  for (index in 1:30) {
    cat("\nIndex=", index)

    # Input file, the object it has to provide, and the two output file names
    if (samplinglevel < 1000) {
      fname <- paste0("results_", samplinglevel, "_", index, ".RData")
      wanted <- "reduced.infections"
      rawout <- paste0("Lange_pend_", samplinglevel, "_", index, ".png")
      testout <- paste0("Lange_test_", samplinglevel, "_", index, ".png")
    } else {
      fname <- paste0("results_", index, ".RData")
      wanted <- "infections"
      rawout <- paste0("Lange_pend_", index, ".png")
      testout <- paste0("Lange_test_", index, ".png")
    }

    # Load simulated case data. load() returns the names of the objects it
    # created; checking them stops the loop from silently re-analysing the
    # case data left over from the previous iteration when a file is incomplete.
    loaded <- load(fname)
    if (!(wanted %in% loaded)) {
      stop("File ", fname, " does not contain an object named '", wanted, "'", call. = FALSE)
    }
    infections <- get(wanted)

    table <- data.frame(x = infections$x, y = infections$y, t = infections$time)
    # Call baseline model code
    test <- endemicity.test(table, dist = seq(0.01, 0.2, 0.01), time = seq(0.05, 1, 0.05),
                            num.permutations = 499, format = "xy")

    # Create graphical outputs; finally= guarantees that each device is closed
    # even if image() fails
    png(testout)
    tryCatch(
      image(t(test$quantile), col = lange_quantile_greys, breaks = lange_quantile_breaks,
            x = test$time, y = test$dist, xlab = "Time", ylab = "Radius"),
      finally = dev.off()
    )
    png(rawout)
    tryCatch(
      image(t(test$orig), col = lange_pend_greys, breaks = lange_pend_breaks,
            x = test$time, y = test$dist, xlab = "Time", ylab = "Radius"),
      finally = dev.off()
    )

    # Output omnibus test results
    torig <- sum(test$orig) # test statistic for original simulated dataset
    # rank of test statistic for original simulated dataset within vector of
    # test statistics for the 499 permuted simulated datasets
    Lange_omnibus_test_results[j, index] <- rank(c(torig, test$psum))[1]
  }
}
# Figures 47 and 50 in the report are copies of Figures 39 and 43 respectively.
# Figures 48, 49, 51, 52 are generated from the graphical outputs produced above. Figure 48 is generated from Lange_test_500_i.png i=1..15. Figure 49 is generated from Lange_pend_500_i.png i=1..15.
# Figure 51 is generated from Lange_test_500_i.png i=16..30. Figure 52 is generated from Lange_pend_500_i.png i=16..30.
# Table 24 can be generated from the ranks contained in Lange_omnibus_test_results e.g. consider the 60% value in the Epidemic/Wide column of Table 24. Epidemic/Wide corresponds to simulations i=1..5 in 6.5.2 and the 60% value is in the 100 sampling level row, so look at Lange_omnibus_test_results[3,1:5]. The 60% value means that three of the five ranks in Lange_omnibus_test_results[3,1:5] are <=25. (Rank<=25 means that the test statistic for the original dataset is in the bottom 5% of the test statistics for the permuted datasets i.e. this is when we reject the null hypothesis.)


